# Notebook display setup: widen the output container and silence warnings.
# FIX: `display`/`HTML` must be imported from IPython.display — importing
# them from IPython.core.display is deprecated (IPython >= 7.14).
from IPython.display import display, HTML, clear_output

display(HTML("<style>.container { width:90% }</style>"))

import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------
# Third-party imports.
# FIX: removed the duplicated `accuracy_score/precision_score/recall_score`
# import, and aliased statsmodels.api as `sm` (the odd `sample_data` alias
# was never used; later cells re-import it as `sm` anyway).
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    confusion_matrix, roc_curve, roc_auc_score,
)

# Mount Google Drive so the listings CSV can be read from it.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Load the Austin Airbnb listings into a pandas DataFrame.
df = pd.read_csv('/content/airbnb_listings_austin.csv')

# Preview the data.
df.info()
# BUG FIX: `df.head` was missing the call parentheses, so the line only
# referenced the bound method and displayed nothing.
print(df.head())
print(df)

# Percentage of NaN values per column.
print((df.isnull().sum() / df.shape[0]) * 100)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5835 entries, 0 to 5834
Data columns (total 54 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 5835 non-null int64
1 listing_url 5835 non-null object
2 name 5835 non-null object
3 summary 5373 non-null object
4 space 4475 non-null object
5 description 5832 non-null object
6 experiences_offered 5835 non-null object
7 neighborhood_overview 3572 non-null object
8 notes 2412 non-null object
9 transit 3492 non-null object
10 host_id 5835 non-null int64
11 host_name 5820 non-null object
12 host_since 5820 non-null object
13 host_location 5810 non-null object
14 host_about 3974 non-null object
15 host_response_time 4177 non-null object
16 host_response_rate 4177 non-null object
17 host_is_superhost 5820 non-null object
18 host_listings_count 5820 non-null float64
19 host_has_profile_pic 5820 non-null object
20 host_identity_verified 5820 non-null object
21 neighbourhood 4800 non-null object
22 city 5835 non-null object
23 property_type 5835 non-null object
24 room_type 5835 non-null object
25 accommodates 5835 non-null int64
26 bathrooms 5789 non-null float64
27 bedrooms 5829 non-null float64
28 beds 5812 non-null float64
29 bed_type 5835 non-null object
30 amenities 5835 non-null object
31 square_feet 302 non-null float64
32 price 5835 non-null object
33 weekly_price 2227 non-null object
34 security_deposit 2770 non-null object
35 cleaning_fee 3587 non-null object
36 guests_included 5835 non-null int64
37 extra_people 5835 non-null object
38 minimum_nights 5835 non-null int64
39 has_availability 5835 non-null object
40 availability_30 5835 non-null int64
41 availability_60 5835 non-null int64
42 availability_90 5835 non-null int64
43 availability_365 5835 non-null int64
44 number_of_reviews 5835 non-null int64
45 review_scores_rating 3789 non-null float64
46 review_scores_accuracy 3776 non-null float64
47 review_scores_cleanliness 3778 non-null float64
48 review_scores_checkin 3778 non-null float64
49 review_scores_communication 3778 non-null float64
50 review_scores_location 3779 non-null float64
51 review_scores_value 3778 non-null float64
52 instant_bookable 5835 non-null object
53 cancellation_policy 5835 non-null object
dtypes: float64(12), int64(10), object(32)
memory usage: 2.4+ MB
id listing_url \
0 72635 https://www.airbnb.com/rooms/72635
1 5386323 https://www.airbnb.com/rooms/5386323
2 8826517 https://www.airbnb.com/rooms/8826517
3 8828616 https://www.airbnb.com/rooms/8828616
4 8536913 https://www.airbnb.com/rooms/8536913
... ... ...
5830 6063670 https://www.airbnb.com/rooms/6063670
5831 8422925 https://www.airbnb.com/rooms/8422925
5832 3345881 https://www.airbnb.com/rooms/3345881
5833 8954997 https://www.airbnb.com/rooms/8954997
5834 7618185 https://www.airbnb.com/rooms/7618185
name \
0 3 Private Bedrooms, SW Austin
1 Cricket Trailer
2 Private room 1 in South Austin
3 Private room 2 in South Austin
4 Brand-New 3BR Austin Home
... ...
5830 Austin's Downtown Garden Suite
5831 Two beds in Downtown Austin!
5832 Casa Romántica en Picos de Europa
5833 Living room with bed
5834 Comfy 1 bedroom in North Austin
summary \
0 Conveniently located 10-15 from downtown in SW...
1 Rent this cool concept trailer that has everyt...
2 Upstairs, private, 12ft x 13 1/2ft room. Priv...
3 Upstairs, private, 11ft x 13 1/2ft room. Priv...
4 Brand-new 3BR/2BA Austin home with landscaped ...
... ...
5830 Enjoy being literally steps from everything th...
5831 Prime location for the Austin Convention Cente...
5832 Axtur: Picos de Europa. Desfiladero del Sella ...
5833 Living room with bed have bathroom.
5834 NaN
space \
0 We have three spare bedrooms, each with a quee...
1 Rental arrangements for this trailer allows yo...
2 NaN
3 NaN
4 Feel instantly at home at our brand new 3BR/2B...
... ...
5830 If you are looking for the perfect suite in th...
5831 Located in the heart of downtown, this room co...
5832 Una casa excepcional en un paisaje excepcional...
5833 NaN
5834 Cozy one bedroom/one bath 1st floor apartment ...
description experiences_offered \
0 Conveniently located 10-15 from downtown in SW... none
1 Rent this cool concept trailer that has everyt... none
2 Upstairs, private, 12ft x 13 1/2ft room. Priv... none
3 Upstairs, private, 11ft x 13 1/2ft room. Priv... none
4 Brand-new 3BR/2BA Austin home with landscaped ... none
... ... ...
5830 Enjoy being literally steps from everything th... none
5831 Prime location for the Austin Convention Cente... none
5832 Una casa excepcional en un paisaje excepcional... none
5833 Living room with bed have bathroom. none
5834 Cozy one bedroom/one bath 1st floor apartment ... none
neighborhood_overview \
0 Location and convenience are key. Easy access...
1 We're talking about wherever you'd like in the...
2 NaN
3 NaN
4 Entertainment and activities are plentiful her...
... ...
5830 I love that the downtown neighborhood is so vi...
5831 This truly is in the middle of everything goin...
5832 Pueblecito asturiano, con muy pocos vecinos, d...
5833 NaN
5834 NaN
notes \
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
5830 If you are interested in hosting an even large...
5831 NaN
5832 Paisaje y tranquilidad.
5833 NaN
5834 The security deposit may be forfeited in the e...
transit ... \
0 Unfortunately there is no convenient public tr... ...
1 Bike, Bus, Metrorail, etc. you name it we've g... ...
2 NaN ...
3 NaN ...
4 NaN ...
... ... ...
5830 In addition to the Airport Flyer that I alread... ...
5831 Buses leave from across the street (including ... ...
5832 En Coche ...
5833 NaN ...
5834 Close to grocery stores, restaurants and a mov... ...
number_of_reviews review_scores_rating review_scores_accuracy \
0 1 100.0 10.0
1 0 NaN NaN
2 0 NaN NaN
3 0 NaN NaN
4 0 NaN NaN
... ... ... ...
5830 9 100.0 10.0
5831 0 NaN NaN
5832 1 100.0 8.0
5833 0 NaN NaN
5834 0 NaN NaN
review_scores_cleanliness review_scores_checkin \
0 10.0 10.0
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
5830 10.0 10.0
5831 NaN NaN
5832 10.0 10.0
5833 NaN NaN
5834 NaN NaN
review_scores_communication review_scores_location review_scores_value \
0 10.0 10.0 10.0
1 NaN NaN NaN
2 NaN NaN NaN
3 NaN NaN NaN
4 NaN NaN NaN
... ... ... ...
5830 10.0 10.0 9.0
5831 NaN NaN NaN
5832 10.0 10.0 8.0
5833 NaN NaN NaN
5834 NaN NaN NaN
instant_bookable cancellation_policy
0 f moderate
1 f moderate
2 f flexible
3 f flexible
4 f strict
... ... ...
5830 f strict
5831 f moderate
5832 t strict
5833 f flexible
5834 f strict
[5835 rows x 54 columns]
id 0.000000
listing_url 0.000000
name 0.000000
summary 7.917738
space 23.307626
description 0.051414
experiences_offered 0.000000
neighborhood_overview 38.783205
notes 58.663239
transit 40.154242
host_id 0.000000
host_name 0.257069
host_since 0.257069
host_location 0.428449
host_about 31.893745
host_response_time 28.414739
host_response_rate 28.414739
host_is_superhost 0.257069
host_listings_count 0.257069
host_has_profile_pic 0.257069
host_identity_verified 0.257069
neighbourhood 17.737789
city 0.000000
property_type 0.000000
room_type 0.000000
accommodates 0.000000
bathrooms 0.788346
bedrooms 0.102828
beds 0.394173
bed_type 0.000000
amenities 0.000000
square_feet 94.824336
price 0.000000
weekly_price 61.833762
security_deposit 52.527849
cleaning_fee 38.526135
guests_included 0.000000
extra_people 0.000000
minimum_nights 0.000000
has_availability 0.000000
availability_30 0.000000
availability_60 0.000000
availability_90 0.000000
availability_365 0.000000
number_of_reviews 0.000000
review_scores_rating 35.064267
review_scores_accuracy 35.287061
review_scores_cleanliness 35.252785
review_scores_checkin 35.252785
review_scores_communication 35.252785
review_scores_location 35.235647
review_scores_value 35.252785
instant_bookable 0.000000
cancellation_policy 0.000000
dtype: float64
# Columns stored as strings with currency / percent symbols.
columns_with_symbols = ['price', 'weekly_price', 'security_deposit',
                        'cleaning_fee', 'extra_people', 'host_response_rate']

# Strip dollar signs, thousands separators, and percent signs, then cast
# to float.  FIX: the regex is a raw string — '\$' in a plain string is an
# invalid escape sequence (SyntaxWarning on Python 3.12+).
for col in columns_with_symbols:
    df[col] = df[col].replace({r'\$': '', ',': '', '%': ''}, regex=True).astype(float)

# Drop rows with missing values in these columns.  A single dropna over
# the full subset replaces the original per-column loop — same rows are
# removed, in one pass instead of thirteen.
cats = ['host_name', 'host_since', 'host_location', 'summary', 'space',
        'description', 'neighborhood_overview', 'notes', 'transit',
        'host_about', 'neighbourhood', 'bedrooms', 'bathrooms']
df.dropna(subset=cats, inplace=True)
Dropping rows with nulls in these columns because the null percentage was judged acceptable (note, however, that `notes` and `transit` are more than 38% null, so this substantially shrinks the dataset).
# List the column names that remain after cleaning.
df.columns
Index(['id', 'listing_url', 'name', 'summary', 'space', 'description',
'experiences_offered', 'neighborhood_overview', 'notes', 'transit',
'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
'host_response_time', 'host_response_rate', 'host_is_superhost',
'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
'neighbourhood', 'city', 'property_type', 'room_type', 'accommodates',
'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
'price', 'weekly_price', 'security_deposit', 'cleaning_fee',
'guests_included', 'extra_people', 'minimum_nights', 'has_availability',
'availability_30', 'availability_60', 'availability_90',
'availability_365', 'number_of_reviews', 'review_scores_rating',
'review_scores_accuracy', 'review_scores_cleanliness',
'review_scores_checkin', 'review_scores_communication',
'review_scores_location', 'review_scores_value', 'instant_bookable',
'cancellation_policy'],
dtype='object')
# Convert the 't'/'f' flag columns to 1/0 integers (one loop instead of
# five copy-pasted lines; behavior is identical).
flag_cols = ['has_availability', 'instant_bookable', 'host_is_superhost',
             'host_identity_verified', 'host_has_profile_pic']
for col in flag_cols:
    df[col] = df[col].apply(lambda x: 1 if x == 't' else 0)

# Numeric feature columns: coerce to numeric, then mean-impute NaNs.
imp = ['host_response_rate', 'host_listings_count', 'beds', 'square_feet',
       'price', 'weekly_price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value']
for col in imp:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    # FIX: assign the result back instead of `fillna(..., inplace=True)`
    # on the column selection — chained inplace fillna is deprecated and
    # may silently fail to update df under pandas 2.x copy-on-write.
    df[col] = df[col].fillna(df[col].mean())

df.head()
| id | listing_url | name | summary | space | description | experiences_offered | neighborhood_overview | notes | transit | ... | number_of_reviews | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 44 | 5606440 | https://www.airbnb.com/rooms/5606440 | LAKEFRONT W DOCK CLOSE TO THE OASIS | Large bdrm with full bath directly across the ... | Situated on a hill on beautiful Lake Travis. W... | Large bdrm with full bath directly across the ... | none | *Close enough to downtown but away from the ch... | We call our home "Club Indigo". We have 3 room... | There are no bus stops out by the lake. We can... | ... | 9 | 100.00000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.00 | 10.000000 | 1 | strict |
| 56 | 4704597 | https://www.airbnb.com/rooms/4704597 | Hillcountry Reatreat Lake Austin | Located in the most desirable Texas Hill Count... | This home is walks from the lake and all the f... | Located in the most desirable Texas Hill Count... | none | The area is like a dream serene setting in the... | The home is fully furnished with brand new mem... | There is no public transport you will need a c... | ... | 4 | 100.00000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.00 | 10.000000 | 0 | strict |
| 58 | 951773 | https://www.airbnb.com/rooms/951773 | WATERFRONT- STEINER RANCH/LAKEWAY | The lake is full! Come stay ON LAKE TRAVIS! Wa... | Situated on a hill on beautiful Lake Travis. W... | The lake is full! Come stay ON LAKE TRAVIS! Wa... | none | *Close enough to downtown but away from the ch... | We call our home "Club Indigo". We have 3 room... | There are no bus stops out by the lake. We can... | ... | 10 | 100.00000 | 10.000000 | 10.000000 | 10.00000 | 10.000000 | 10.00 | 10.000000 | 0 | strict |
| 59 | 8268970 | https://www.airbnb.com/rooms/8268970 | Austin Casita SW area | This Casita is in an upscale area of SW Austi... | Our location is private, yet close to city and... | This Casita is in an upscale area of SW Austi... | none | Our neighborhood is about 35 private homes on ... | This is our back house to our home. We have t... | If available we will Uber for a discounted rat... | ... | 0 | 96.13531 | 9.700465 | 9.594419 | 9.88093 | 9.893855 | 9.52 | 9.481378 | 1 | flexible |
| 81 | 4404358 | https://www.airbnb.com/rooms/4404358 | Cozy centrally located couch | Welcome to the dopest pad in ATX! This spaciou... | We have a little creek outside our apartment a... | Welcome to the dopest pad in ATX! This spaciou... | none | We live in the North loop area which is the mo... | My roommate and I are pretty laid-back people,... | There's free parking on-site, and if you wante... | ... | 2 | 80.00000 | 9.000000 | 9.000000 | 10.00000 | 9.000000 | 9.00 | 9.000000 | 0 | moderate |
5 rows × 54 columns
Fit full regression
import pandas as pd
def extract_numeric_features(df):
    """Return a new DataFrame containing only the numeric-dtype columns of *df*."""
    return df.select_dtypes(include=['number'])
# Keep only the numeric columns of the cleaned listings DataFrame `df`
# (used below for histograms, scatter plots, and correlation analysis).
numeric_df = extract_numeric_features(df)
# Draw one histogram (with a KDE overlay) per numeric feature.
for col in numeric_df:
    plt.figure(figsize=(10, 3))
    sns.histplot(df, x=col, bins=30, kde=True, color='skyblue')
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.show()
# Scatter each numeric feature against the target variable `price`,
# skipping the target itself.
for col in numeric_df:
    if col == 'price':
        continue
    plt.figure(figsize=(10, 3))
    sns.scatterplot(df, x=col, y='price')
    # For a fitted line, sns.regplot could be used here instead.
    plt.title(f'Scatter Plot of price = {col} ')
    plt.xlabel(col)
    plt.ylabel('price')
    plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# Rank the numeric features by their correlation with `price` and keep
# the ten strongest (the first entry is `price` itself, corr == 1.0).
price_corr = numeric_df.corr()['price'].sort_values(ascending=False)
top_corr_features = price_corr.head(10)

# Correlation matrix restricted to those features, rendered as a heatmap.
df_correlations = df[top_corr_features.index].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(df_correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap against Price')
plt.show()
import pandas as pd

# Report how many distinct values each object-dtype (categorical)
# column of the listings DataFrame contains.
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
for cat_col in categorical_columns:
    print(f"Column '{cat_col}' has {df[cat_col].nunique()} unique values.")
Column 'listing_url' has 1307 unique values. Column 'name' has 1301 unique values. Column 'summary' has 1275 unique values. Column 'space' has 1294 unique values. Column 'description' has 1302 unique values. Column 'experiences_offered' has 1 unique values. Column 'neighborhood_overview' has 1217 unique values. Column 'notes' has 1215 unique values. Column 'transit' has 1225 unique values. Column 'host_name' has 660 unique values. Column 'host_since' has 758 unique values. Column 'host_location' has 41 unique values. Column 'host_about' has 1040 unique values. Column 'host_response_time' has 4 unique values. Column 'neighbourhood' has 70 unique values. Column 'city' has 3 unique values. Column 'property_type' has 16 unique values. Column 'room_type' has 3 unique values. Column 'bed_type' has 5 unique values. Column 'amenities' has 1142 unique values. Column 'cancellation_policy' has 4 unique values.
# Candidate predictor columns for the price model (target included first).
pred_price = [
    'price',
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'square_feet', 'host_response_time', 'host_response_rate',
    'host_is_superhost', 'neighbourhood', 'city', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'security_deposit', 'cleaning_fee', 'extra_people',
    'cancellation_policy', 'minimum_nights', 'availability_30',
    'availability_60', 'availability_90', 'availability_365',
]

# Keep only the non-object (numeric) columns for the regression work.
numerical_columns = df[pred_price].select_dtypes(exclude=['object'])
print(numerical_columns)
price accommodates bathrooms bedrooms beds square_feet \
44 85.0 2 1.0 1.0 1.0 1146.732558
56 599.0 16 3.0 5.0 11.0 1146.732558
58 85.0 2 1.0 1.0 1.0 1146.732558
59 110.0 2 1.0 1.0 1.0 1146.732558
81 75.0 1 1.0 1.0 2.0 1146.732558
... ... ... ... ... ... ...
5810 99.0 2 1.0 1.0 1.0 1146.732558
5815 49.0 1 1.0 1.0 2.0 1146.732558
5819 185.0 4 1.0 1.0 1.0 1146.732558
5827 500.0 8 3.0 4.0 4.0 1146.732558
5830 179.0 4 1.0 1.0 2.0 1146.732558
host_response_rate host_is_superhost review_scores_rating \
44 100.000000 1 100.00000
56 100.000000 0 100.00000
58 100.000000 1 100.00000
59 100.000000 0 96.13531
81 95.265471 0 80.00000
... ... ... ...
5810 86.000000 0 91.00000
5815 86.000000 0 93.00000
5819 100.000000 1 94.00000
5827 87.000000 0 94.00000
5830 100.000000 1 100.00000
review_scores_accuracy review_scores_cleanliness security_deposit \
44 10.000000 10.000000 100.000000
56 10.000000 10.000000 366.686732
58 10.000000 10.000000 100.000000
59 9.700465 9.594419 366.686732
81 9.000000 9.000000 366.686732
... ... ... ...
5810 9.000000 9.000000 95.000000
5815 9.000000 10.000000 95.000000
5819 10.000000 9.000000 150.000000
5827 9.000000 9.000000 500.000000
5830 10.000000 10.000000 500.000000
cleaning_fee extra_people minimum_nights availability_30 \
44 25.000000 25.0 2 25
56 300.000000 35.0 2 26
58 25.000000 25.0 2 26
59 70.729424 0.0 1 17
81 5.000000 0.0 1 28
... ... ... ... ...
5810 25.000000 25.0 3 17
5815 25.000000 49.0 3 24
5819 70.000000 10.0 1 21
5827 180.000000 0.0 2 19
5830 125.000000 50.0 1 15
availability_60 availability_90 availability_365
44 42 72 341
56 56 86 361
58 56 86 361
59 47 77 352
81 58 88 88
... ... ... ...
5810 47 77 78
5815 54 84 84
5819 51 81 81
5827 49 76 343
5830 45 65 340
[1307 rows x 19 columns]
# Fit the full OLS regression of price on every numeric predictor.
import statsmodels.api as sm

y = numerical_columns['price']
X = sm.add_constant(numerical_columns.drop(columns=['price']))
fullreg = sm.OLS(y, X).fit()
print(fullreg.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.504
Model: OLS Adj. R-squared: 0.497
Method: Least Squares F-statistic: 72.75
Date: Tue, 12 Dec 2023 Prob (F-statistic): 2.15e-181
Time: 15:13:40 Log-Likelihood: -8665.7
No. Observations: 1307 AIC: 1.737e+04
Df Residuals: 1288 BIC: 1.747e+04
Df Model: 18
Covariance Type: nonrobust
=============================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------
const -456.1365 126.614 -3.603 0.000 -704.530 -207.743
accommodates 8.9360 3.852 2.320 0.021 1.378 16.494
bathrooms 91.7108 12.051 7.610 0.000 68.070 115.352
bedrooms 33.8172 9.371 3.609 0.000 15.432 52.202
beds -7.4885 6.256 -1.197 0.232 -19.762 4.785
square_feet 0.0932 0.027 3.493 0.000 0.041 0.146
host_response_rate 0.2735 0.500 0.547 0.584 -0.707 1.254
host_is_superhost -32.2639 12.216 -2.641 0.008 -56.230 -8.298
review_scores_rating 5.0531 1.498 3.374 0.001 2.115 7.991
review_scores_accuracy -26.3594 11.955 -2.205 0.028 -49.812 -2.907
review_scores_cleanliness -5.2379 9.884 -0.530 0.596 -24.628 14.152
security_deposit 0.1758 0.021 8.294 0.000 0.134 0.217
cleaning_fee 1.1637 0.155 7.524 0.000 0.860 1.467
extra_people 0.3612 0.151 2.398 0.017 0.066 0.657
minimum_nights 1.3601 1.757 0.774 0.439 -2.086 4.806
availability_30 2.4161 1.338 1.806 0.071 -0.209 5.041
availability_60 1.4452 1.291 1.119 0.263 -1.088 3.978
availability_90 -2.1338 0.678 -3.149 0.002 -3.463 -0.804
availability_365 0.1687 0.057 2.982 0.003 0.058 0.280
==============================================================================
Omnibus: 1161.545 Durbin-Watson: 1.980
Prob(Omnibus): 0.000 Jarque-Bera (JB): 61592.794
Skew: 3.880 Prob(JB): 0.00
Kurtosis: 35.723 Cond. No. 3.13e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# Variance inflation factors for the full-model design matrix
# (the constant column is included, hence its very large VIF).
vifres = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vifres)
Variable VIF 0 const 614.393013 1 accommodates 4.434816 2 bathrooms 2.534495 3 bedrooms 4.017986 4 beds 4.058308 5 square_feet 1.075079 6 host_response_rate 1.047838 7 host_is_superhost 1.117366 8 review_scores_rating 1.891659 9 review_scores_accuracy 1.741378 10 review_scores_cleanliness 1.836615 11 security_deposit 1.315338 12 cleaning_fee 1.880256 13 extra_people 1.074484 14 minimum_nights 1.015268 15 availability_30 8.595237 16 availability_60 29.047504 17 availability_90 17.153156 18 availability_365 1.614315
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS

    Arguments:
        X - pandas.DataFrame of numeric features
        y - vector, series of the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions

    Returns: list of selected features

    Example Call: stepwise_selection(X, y)
    """
    # FIX: the original used a mutable default argument (initial_list=[]),
    # which is shared across calls; None is the proper sentinel.
    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False

        # Forward step: fit one model per candidate feature and add the
        # candidate with the smallest p-value, if below threshold_in.
        excluded = list(set(X.columns) - set(included))
        # FIX: declare float dtype explicitly — an all-NaN Series without a
        # dtype is object-typed, which is deprecated and breaks idxmin().
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Add {best_feature} with p-value {best_pval:.4f}')

        # Backward step: refit on the included set and drop the worst
        # feature if its p-value exceeds threshold_out.
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # Use all coefficients except the intercept.
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print(f'Drop {worst_feature} with p-value {worst_pval:.4f}')

        # Stop once a full pass makes no additions or removals.
        if not changed:
            break
    return included
# Run the forward-backward selection on the full design matrix (X, y
# from the full regression above) and report the surviving features.
selected_features = stepwise_selection(X, y)
print('resulting features:')
print(selected_features)
Add const with p-value 0.0000 Add bathrooms with p-value 0.0000 Add cleaning_fee with p-value 0.0000 Add security_deposit with p-value 0.0000 Add bedrooms with p-value 0.0000 Add square_feet with p-value 0.0009 resulting features: ['const', 'bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']
# Refit OLS using only the stepwise-selected predictors.
step_cols = ['bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']
X = sm.add_constant(numerical_columns[step_cols])
stepreg = sm.OLS(y, X).fit()
print(stepreg.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.484
Model: OLS Adj. R-squared: 0.482
Method: Least Squares F-statistic: 243.6
Date: Tue, 12 Dec 2023 Prob (F-statistic): 8.78e-184
Time: 15:14:21 Log-Likelihood: -8692.2
No. Observations: 1307 AIC: 1.740e+04
Df Residuals: 1301 BIC: 1.743e+04
Df Model: 5
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const -240.1393 30.882 -7.776 0.000 -300.723 -179.556
bathrooms 93.9019 12.087 7.769 0.000 70.190 117.613
cleaning_fee 1.2744 0.153 8.315 0.000 0.974 1.575
security_deposit 0.1824 0.021 8.550 0.000 0.141 0.224
bedrooms 43.6069 7.698 5.665 0.000 28.505 58.709
square_feet 0.0901 0.027 3.342 0.001 0.037 0.143
==============================================================================
Omnibus: 1146.638 Durbin-Watson: 1.988
Prob(Omnibus): 0.000 Jarque-Bera (JB): 56111.444
Skew: 3.831 Prob(JB): 0.00
Kurtosis: 34.171 Cond. No. 7.30e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.3e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Variance inflation factors for the reduced (stepwise) design matrix.
vifresstep = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
print(vifresstep)
Variable VIF 0 const 35.447970 1 bathrooms 2.472837 2 cleaning_fee 1.790782 3 security_deposit 1.291727 4 bedrooms 2.629538 5 square_feet 1.063576
# Residual diagnostics for the stepwise model.
resid_step = stepreg.resid

# Q-Q plot of the residuals against the 45-degree reference line.
fig = sm.qqplot(resid_step, fit=True, line='45')
plt.show()

# Residuals vs. fitted values, with a horizontal zero reference line.
plt.figure(figsize=(10, 6))
plt.scatter(stepreg.fittedvalues, resid_step)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')
plt.grid(True)
plt.show()
# NOTE(review): the original comment said "log transformation", but the
# code actually applies a SQUARE-ROOT transform to the target (np.sqrt);
# the names `logy`/`logreg` are likewise misleading.  The reported
# R-squared below corresponds to the sqrt-transformed fit.
X = numerical_columns[['bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']]
X = sm.add_constant(X)
logy = np.sqrt(y)
logreg = sm.OLS(logy, X).fit()
# Display regression summary
print(logreg.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.560
Model: OLS Adj. R-squared: 0.558
Method: Least Squares F-statistic: 330.9
Date: Tue, 12 Dec 2023 Prob (F-statistic): 7.90e-229
Time: 15:14:27 Log-Likelihood: -3739.9
No. Observations: 1307 AIC: 7492.
Df Residuals: 1301 BIC: 7523.
Df Model: 5
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const 2.8384 0.698 4.064 0.000 1.468 4.209
bathrooms 1.9804 0.273 7.245 0.000 1.444 2.517
cleaning_fee 0.0322 0.003 9.290 0.000 0.025 0.039
security_deposit 0.0030 0.000 6.304 0.000 0.002 0.004
bedrooms 1.8859 0.174 10.832 0.000 1.544 2.227
square_feet 0.0013 0.001 2.060 0.040 6.02e-05 0.002
==============================================================================
Omnibus: 477.581 Durbin-Watson: 1.898
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2985.815
Skew: 1.558 Prob(JB): 0.00
Kurtosis: 9.717 Cond. No. 7.30e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.3e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Recompute residuals for the transformed-target model and rerun the
# same diagnostics as before.
residuals = logreg.resid

# Q-Q plot against the 45-degree line.
fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()

# Residuals vs. fitted values with a zero reference line.
plt.figure(figsize=(10, 6))
plt.scatter(logreg.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')
plt.grid(True)
plt.show()
# Box plots of price across each categorical (object-dtype) predictor.
object_columns = df[pred_price].select_dtypes(include=['object'])
for cat_col in object_columns.columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=cat_col, y='price', data=df)
    plt.title(f'Boxplot of {cat_col} against Price')
    plt.xticks(rotation=45)
    plt.xlabel(cat_col)
    plt.ylabel('Price')
    plt.show()
# NOTE(review): as above, this is a SQUARE-ROOT (not log) transform of the
# target, despite the `logy`/`logreg` names.
# Filter out object-type columns and create dummy variables
object_columns = df[pred_price].select_dtypes(include=['object'])
# NOTE(review): on pandas >= 2.0 get_dummies returns bool-dtype columns,
# which sm.OLS may reject — confirm or cast to float if upgrading pandas.
dummy_columns = pd.get_dummies(object_columns, drop_first=True) # Use drop_first to avoid multicollinearity
# Select numerical columns
n = numerical_columns[['bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']]
# Concatenate numerical columns with dummy variables
temp = pd.concat([n, dummy_columns], axis=1)
X = sm.add_constant(temp)
logy = np.sqrt(y)
logreg = sm.OLS(logy, X).fit()
# Display regression summary
print(logreg.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.700
Model: OLS Adj. R-squared: 0.676
Method: Least Squares F-statistic: 28.78
Date: Tue, 12 Dec 2023 Prob (F-statistic): 7.35e-251
Time: 15:14:34 Log-Likelihood: -3489.1
No. Observations: 1307 AIC: 7176.
Df Residuals: 1208 BIC: 7689.
Df Model: 98
Covariance Type: nonrobust
============================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------------
const 6.3114 1.115 5.662 0.000 4.124 8.498
bathrooms 2.1495 0.248 8.656 0.000 1.662 2.637
cleaning_fee 0.0193 0.003 5.960 0.000 0.013 0.026
security_deposit 0.0028 0.000 6.534 0.000 0.002 0.004
bedrooms 1.5192 0.171 8.888 0.000 1.184 1.855
square_feet 0.0013 0.001 2.404 0.016 0.000 0.002
property_type_Bed & Breakfast 2.3648 2.648 0.893 0.372 -2.830 7.560
property_type_Boat -2.3787 3.666 -0.649 0.517 -9.571 4.813
property_type_Bungalow 0.8287 2.231 0.371 0.710 -3.549 5.206
property_type_Cabin 0.8140 1.444 0.564 0.573 -2.019 3.647
property_type_Camper/RV -0.9860 1.358 -0.726 0.468 -3.650 1.678
property_type_Chalet 2.1302 3.676 0.580 0.562 -5.081 9.342
property_type_Condominium -0.6283 0.869 -0.723 0.470 -2.334 1.077
property_type_Earth House -0.0326 3.690 -0.009 0.993 -7.272 7.207
property_type_House 0.6926 0.306 2.262 0.024 0.092 1.293
property_type_Loft 1.0908 0.749 1.456 0.146 -0.379 2.561
property_type_Other -0.1341 1.148 -0.117 0.907 -2.387 2.118
property_type_Tent -3.8704 3.724 -1.039 0.299 -11.176 3.436
property_type_Tipi -0.3563 3.774 -0.094 0.925 -7.761 7.048
property_type_Townhouse -0.7478 1.129 -0.663 0.508 -2.962 1.467
property_type_Villa -1.2836 3.692 -0.348 0.728 -8.528 5.960
room_type_Private room -2.5467 0.294 -8.664 0.000 -3.123 -1.970
room_type_Shared room -4.4068 0.846 -5.207 0.000 -6.067 -2.747
host_response_time_within a day -3.6206 0.409 -8.858 0.000 -4.422 -2.819
host_response_time_within a few hours -4.0718 0.344 -11.835 0.000 -4.747 -3.397
host_response_time_within an hour -4.2225 0.316 -13.345 0.000 -4.843 -3.602
neighbourhood_Anderson Mill 0.2248 1.722 0.131 0.896 -3.153 3.602
neighbourhood_Balcones Civic Association 3.2489 2.024 1.605 0.109 -0.723 7.221
neighbourhood_Barton Creek 2.9882 2.727 1.096 0.273 -2.362 8.338
neighbourhood_Barton Hills 2.3778 1.021 2.329 0.020 0.375 4.381
neighbourhood_Bouldin Creek 2.7877 1.038 2.685 0.007 0.751 4.825
neighbourhood_Brentwood 0.7645 1.157 0.661 0.509 -1.505 3.034
neighbourhood_Bryker Woods 3.0245 2.285 1.324 0.186 -1.458 7.507
neighbourhood_Bull Creek -0.0473 1.635 -0.029 0.977 -3.255 3.160
neighbourhood_Cherry Creek -0.4194 1.062 -0.395 0.693 -2.504 1.665
neighbourhood_Cherrywood -6.4792 3.832 -1.691 0.091 -13.997 1.038
neighbourhood_Clarksville 3.0547 1.147 2.663 0.008 0.804 5.305
neighbourhood_Copperfield -2.9738 3.755 -0.792 0.429 -10.341 4.393
neighbourhood_Crestview -1.3549 1.503 -0.901 0.368 -4.304 1.595
neighbourhood_Dawson 1.6165 1.043 1.549 0.122 -0.431 3.664
neighbourhood_Downtown 3.4356 1.047 3.282 0.001 1.382 5.489
neighbourhood_East Congress -0.0994 1.452 -0.068 0.945 -2.949 2.750
neighbourhood_East Downtown 3.2923 0.919 3.583 0.000 1.489 5.095
neighbourhood_East Riverside 0.4549 1.046 0.435 0.664 -1.597 2.507
neighbourhood_Galindo 0.4761 1.156 0.412 0.681 -1.792 2.744
neighbourhood_Govalle 1.8489 1.102 1.677 0.094 -0.314 4.011
neighbourhood_Gracywoods -1.6309 2.302 -0.708 0.479 -6.147 2.886
neighbourhood_Hancock 0.1647 1.120 0.147 0.883 -2.032 2.361
neighbourhood_Highland 0.8177 1.330 0.615 0.539 -1.792 3.427
neighbourhood_Holly 2.3920 1.017 2.353 0.019 0.397 4.387
neighbourhood_Hyde Park 1.2182 1.155 1.055 0.292 -1.048 3.484
neighbourhood_Lamplight Village 0.5197 2.286 0.227 0.820 -3.965 5.005
neighbourhood_Long Canyon 4.1144 2.029 2.027 0.043 0.133 8.096
neighbourhood_MLK & 183 -0.2457 1.154 -0.213 0.831 -2.509 2.018
neighbourhood_McKinney 0.2733 1.288 0.212 0.832 -2.253 2.800
neighbourhood_Milwood -1.0132 2.723 -0.372 0.710 -6.355 4.328
neighbourhood_Montopolis -2.3505 1.842 -1.276 0.202 -5.965 1.264
neighbourhood_Mueller 0.4598 1.264 0.364 0.716 -2.020 2.939
neighbourhood_North Loop 1.9775 1.284 1.540 0.124 -0.541 4.496
neighbourhood_North Shoal Creek 0.0873 2.016 0.043 0.965 -3.868 4.043
neighbourhood_Northwest Hills -0.2856 1.371 -0.208 0.835 -2.975 2.404
neighbourhood_Oak Hill 0.4319 1.770 0.244 0.807 -3.040 3.904
neighbourhood_Old Enfield 3.9542 2.028 1.950 0.051 -0.025 7.933
neighbourhood_Old West Austin 4.0740 1.065 3.825 0.000 1.985 6.163
neighbourhood_Parker Lane 0.0921 1.206 0.076 0.939 -2.273 2.457
neighbourhood_Pecan Spings 0.6457 1.646 0.392 0.695 -2.584 3.876
neighbourhood_Pleasant Valley 1.9377 1.634 1.186 0.236 -1.268 5.144
neighbourhood_Rainey Street 3.5942 3.838 0.936 0.349 -3.936 11.125
neighbourhood_Rollingwood 6.9974 2.955 2.368 0.018 1.201 12.794
neighbourhood_Rosedale 2.7050 1.239 2.183 0.029 0.273 5.137
neighbourhood_Rosewood -0.0064 1.064 -0.006 0.995 -2.093 2.081
neighbourhood_SW Williamson Co. -1.0566 2.286 -0.462 0.644 -5.542 3.429
neighbourhood_South Congress 3.4130 1.159 2.945 0.003 1.139 5.687
neighbourhood_South First 3.3920 1.075 3.154 0.002 1.282 5.502
neighbourhood_South Lamar 2.0646 1.033 1.999 0.046 0.039 4.090
neighbourhood_South Manchaca 0.8765 1.179 0.743 0.457 -1.437 3.190
neighbourhood_St. Edwards 1.0860 1.263 0.860 0.390 -1.392 3.564
neighbourhood_St. Johns 0.2640 2.744 0.096 0.923 -5.120 5.648
neighbourhood_Steiner Ranch 2.2251 2.284 0.974 0.330 -2.256 6.706
neighbourhood_Sunset Valley 1.4631 1.011 1.447 0.148 -0.521 3.448
neighbourhood_Tarrytown 2.3959 1.232 1.945 0.052 -0.021 4.813
neighbourhood_Travis Heights 2.8197 0.966 2.919 0.004 0.924 4.715
neighbourhood_University Hills -0.1766 2.726 -0.065 0.948 -5.526 5.172
neighbourhood_University of Texas 1.6276 1.268 1.283 0.200 -0.861 4.116
neighbourhood_Upper Boggy Creek 1.5244 1.176 1.296 0.195 -0.783 3.832
neighbourhood_Walnut Creek 2.8406 3.744 0.759 0.448 -4.504 10.185
neighbourhood_West Austin -0.8102 2.721 -0.298 0.766 -6.148 4.528
neighbourhood_West Campus 3.1710 1.305 2.431 0.015 0.611 5.730
neighbourhood_West Congress -0.7815 1.554 -0.503 0.615 -3.830 2.267
neighbourhood_Westgate -0.3043 2.284 -0.133 0.894 -4.785 4.177
neighbourhood_Westlake Hills 2.0257 1.681 1.205 0.228 -1.273 5.324
neighbourhood_Windsor Hills -1.7916 3.740 -0.479 0.632 -9.130 5.546
neighbourhood_Windsor Park -0.1799 1.280 -0.141 0.888 -2.690 2.331
neighbourhood_Wooten -1.6602 2.051 -0.809 0.419 -5.685 2.365
neighbourhood_Zilker 3.0918 0.967 3.198 0.001 1.195 4.989
city_Sunset Valley 1.4631 1.011 1.447 0.148 -0.521 3.448
city_West Lake Hills -3.4568 2.309 -1.497 0.135 -7.987 1.074
cancellation_policy_moderate -0.0754 0.311 -0.243 0.808 -0.685 0.534
cancellation_policy_strict -0.3984 0.296 -1.346 0.178 -0.979 0.182
cancellation_policy_super_strict_30 14.8458 2.655 5.592 0.000 9.637 20.055
==============================================================================
Omnibus: 361.904 Durbin-Watson: 2.021
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2338.457
Skew: 1.121 Prob(JB): 0.00
Kurtosis: 9.158 Cond. No. 2.00e+19
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.92e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
selected_features = stepwise_selection(X, logy)
print('resulting features:')
print(selected_features)
Add const with p-value 0.0000 Add bedrooms with p-value 0.0000 Add cleaning_fee with p-value 0.0000 Add room_type_Private room with p-value 0.0000 Add bathrooms with p-value 0.0000 Add security_deposit with p-value 0.0000 Add host_response_time_within an hour with p-value 0.0000 Add host_response_time_within a few hours with p-value 0.0000 Add host_response_time_within a day with p-value 0.0000 Add room_type_Shared room with p-value 0.0000 Add cancellation_policy_super_strict_30 with p-value 0.0000 Add neighbourhood_East Downtown with p-value 0.0000 Add neighbourhood_Old West Austin with p-value 0.0001 Add neighbourhood_Zilker with p-value 0.0012 Add neighbourhood_South First with p-value 0.0044 Add neighbourhood_Travis Heights with p-value 0.0036 Add neighbourhood_Downtown with p-value 0.0018 Add square_feet with p-value 0.0049 Add neighbourhood_Bouldin Creek with p-value 0.0044 Add neighbourhood_South Congress with p-value 0.0082 Add neighbourhood_Holly with p-value 0.0086 resulting features: ['const', 'bedrooms', 'cleaning_fee', 'room_type_Private room', 'bathrooms', 'security_deposit', 'host_response_time_within an hour', 'host_response_time_within a few hours', 'host_response_time_within a day', 'room_type_Shared room', 'cancellation_policy_super_strict_30', 'neighbourhood_East Downtown', 'neighbourhood_Old West Austin', 'neighbourhood_Zilker', 'neighbourhood_South First', 'neighbourhood_Travis Heights', 'neighbourhood_Downtown', 'square_feet', 'neighbourhood_Bouldin Creek', 'neighbourhood_South Congress', 'neighbourhood_Holly']
#trying a log transformation
# NOTE(review): as above, this is actually a square-root transform (np.sqrt),
# not a log transform; the `logy` name is misleading but kept for consistency.
# Refit using only the features chosen by stepwise selection
X = temp[['bedrooms', 'cleaning_fee', 'room_type_Private room', 'bathrooms', 'security_deposit', 'host_response_time_within an hour', 'host_response_time_within a few hours', 'host_response_time_within a day', 'room_type_Shared room', 'cancellation_policy_super_strict_30', 'neighbourhood_East Downtown', 'neighbourhood_Old West Austin', 'neighbourhood_Zilker', 'neighbourhood_South First', 'neighbourhood_Travis Heights', 'neighbourhood_Downtown', 'square_feet', 'neighbourhood_Bouldin Creek', 'neighbourhood_South Congress', 'neighbourhood_Holly']]
X = sm.add_constant(X)
logy = np.sqrt(y)
logreg = sm.OLS(logy, X).fit()
# Display regression summary
print(logreg.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.675
Model: OLS Adj. R-squared: 0.669
Method: Least Squares F-statistic: 133.3
Date: Tue, 12 Dec 2023 Prob (F-statistic): 5.48e-296
Time: 15:14:51 Log-Likelihood: -3542.6
No. Observations: 1307 AIC: 7127.
Df Residuals: 1286 BIC: 7236.
Df Model: 20
Covariance Type: nonrobust
=========================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------
const 7.1945 0.674 10.674 0.000 5.872 8.517
bedrooms 1.4738 0.158 9.334 0.000 1.164 1.784
cleaning_fee 0.0210 0.003 6.771 0.000 0.015 0.027
room_type_Private room -2.6253 0.257 -10.233 0.000 -3.129 -2.122
bathrooms 2.2529 0.238 9.472 0.000 1.786 2.720
security_deposit 0.0026 0.000 6.226 0.000 0.002 0.003
host_response_time_within an hour -4.1493 0.302 -13.726 0.000 -4.742 -3.556
host_response_time_within a few hours -3.8902 0.332 -11.733 0.000 -4.541 -3.240
host_response_time_within a day -3.4859 0.396 -8.807 0.000 -4.262 -2.709
room_type_Shared room -4.7189 0.792 -5.955 0.000 -6.273 -3.164
cancellation_policy_super_strict_30 16.6401 2.602 6.396 0.000 11.536 21.744
neighbourhood_East Downtown 2.3487 0.345 6.815 0.000 1.673 3.025
neighbourhood_Old West Austin 2.9280 0.632 4.629 0.000 1.687 4.169
neighbourhood_Zilker 2.0090 0.457 4.401 0.000 1.113 2.905
neighbourhood_South First 2.3657 0.654 3.615 0.000 1.082 3.650
neighbourhood_Travis Heights 1.6817 0.452 3.719 0.000 0.795 2.569
neighbourhood_Downtown 2.1049 0.569 3.697 0.000 0.988 3.222
square_feet 0.0015 0.001 2.774 0.006 0.000 0.003
neighbourhood_Bouldin Creek 1.8736 0.600 3.123 0.002 0.697 3.050
neighbourhood_South Congress 2.1718 0.782 2.778 0.006 0.638 3.705
neighbourhood_Holly 1.4566 0.553 2.632 0.009 0.371 2.542
==============================================================================
Omnibus: 349.384 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2031.623
Skew: 1.109 Prob(JB): 0.00
Kurtosis: 8.691 Cond. No. 3.14e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.14e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# Diagnostics for the reduced (stepwise-selected) model
resid_vals = logreg.resid
# Normality check: Q-Q plot of residuals against the 45-degree line
sm.qqplot(resid_vals, fit=True, line='45')
plt.show()
# Homoscedasticity check: residuals against fitted values
plt.figure(figsize=(10, 6))
plt.scatter(logreg.fittedvalues, resid_vals)
plt.title('Residuals vs. Fitted Values')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.axhline(y=0, color='r', linestyle='--')  # zero-residual reference line
plt.grid(True)
plt.show()
from IPython.core.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:90% }</style>"))
import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.feature_selection import SelectFromModel
# function to calculate performance from sklearn logistic regression models
def performance_rpt(model, X_test, y_test):
    """Print and return classification metrics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted classifier exposing predict() and predict_proba()
    X_test : feature matrix for the held-out test set
    y_test : true binary labels (positive class encoded as 1)

    Returns
    -------
    tuple of (accuracy, precision, recall, aucroc)
    """
    y_pred = model.predict(X_test)
    # probability of the positive class (column 1) is what AUC-ROC needs
    y_proba = model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    aucroc = roc_auc_score(y_test, y_proba)
    # Print the performance metrics
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall : {recall:.4f}")
    print(f"AUC-ROC : {aucroc:.4f}\n")
    return accuracy, precision, recall, aucroc
# function for model coefficients from sklearn logistic regression
def rpt_model_variables(model, feature_names=None):
    """Return a DataFrame of model coefficients sorted by absolute magnitude.

    Parameters
    ----------
    model : fitted sklearn linear model exposing intercept_ and coef_
    feature_names : sequence of str, optional
        Column names matching the features the model was fit on. Defaults to
        the module-level X2.columns for backward compatibility.
        NOTE(review): the X2 default relies on a global defined elsewhere in
        the notebook — confirm it exists before calling without feature_names.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'coefficient' (rounded to 4 decimals) and
        'abs_coefficient', sorted by 'abs_coefficient' descending.
    """
    if feature_names is None:
        feature_names = X2.columns
    intercept = model.intercept_  # retrieved for reference; not added to the table
    # round for readable reporting
    coefficients = np.round(model.coef_[0], decimals=4)
    df_coefficients = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
    # Print the DataFrame
    print(df_coefficients)
    return df_coefficients
# plot variable importance for sklearn logistic regression
def plot_variable_imp(df_coef):
    """Bar-plot |coefficient| for retained features and list rejected ones.

    Parameters
    ----------
    df_coef : pandas.DataFrame
        Output of rpt_model_variables: columns 'feature', 'coefficient',
        'abs_coefficient'. Features with abs_coefficient == 0 are treated
        as rejected by regularization and listed instead of plotted.
    """
    kept = df_coef[df_coef['abs_coefficient'] != 0]
    reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()
    plt.figure(figsize=(5, 10))
    # pass column NAMES and let seaborn resolve them against `data`
    # (the original passed redundant Series alongside data=)
    sns.barplot(data=kept, y='feature', x='abs_coefficient', color="lightblue")
    # set title/labels AFTER plotting — seaborn writes axis labels from the
    # column names and would overwrite labels set beforehand
    plt.title('Variable Importance')
    plt.xlabel('Coefficient')
    plt.ylabel('Feature')
    plt.show()
    print("-- rejected --")
    for i in reject_vars:
        print(f" {i}")
# create pred_booked containing the columns I want to use to answer this question
# List of predictor variables for booking probability.
# .copy() makes an explicit copy so the 'booked' assignment below writes to an
# independent frame instead of a view of df (avoids SettingWithCopyWarning and
# silently-lost assignments under copy-on-write pandas).
pred_booked = df[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price', 'security_deposit',
                  'cleaning_fee', 'number_of_reviews', 'review_scores_location',
                  'review_scores_value', 'square_feet', 'host_response_time', 'host_response_rate', 'host_is_superhost',
                  'neighbourhood', 'review_scores_rating', 'review_scores_accuracy',
                  'review_scores_cleanliness', 'cancellation_policy', 'minimum_nights',
                  'availability_30', 'availability_60', 'availability_365'
                  ]].copy()
# Create a new column 'booked' based on 90-day availability less than 40%
# (i.e. booked = 1 when the listing is open fewer than 36 of the next 90 days)
pred_booked['booked'] = (df['availability_90'] < 0.4 * 90).astype(int)
pred_booked.columns
Index(['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
'number_of_reviews', 'review_scores_location', 'review_scores_value',
'square_feet', 'host_response_time', 'host_response_rate',
'host_is_superhost', 'neighbourhood', 'review_scores_rating',
'review_scores_accuracy', 'review_scores_cleanliness',
'cancellation_policy', 'minimum_nights', 'availability_30',
'availability_60', 'availability_365', 'booked'],
dtype='object')
import seaborn as sns
import matplotlib.pyplot as plt
columns_of_interest = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
                       'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
                       'number_of_reviews', 'review_scores_location',
                       'review_scores_value', 'square_feet', 'host_response_time',
                       'host_response_rate', 'host_is_superhost', 'neighbourhood',
                       'review_scores_rating', 'review_scores_accuracy',
                       'review_scores_cleanliness', 'cancellation_policy', 'minimum_nights',
                       'availability_30', 'availability_60', 'availability_365']
subset = pred_booked[columns_of_interest]
# Calculate the correlation matrix over numeric columns only: corr() on
# object-dtype columns (property_type, room_type, ...) raises in modern pandas
correlation_matrix = subset.select_dtypes(include='number').corr()
# Plotting the heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Variables')
plt.show()
# Print the names of the categorical (object-dtype) columns in pred_booked
for col_name in pred_booked.select_dtypes(include=['object']).columns:
    print(col_name)
property_type room_type bed_type host_response_time neighbourhood cancellation_policy
# split the data into training and testing sets
# (removed a dead assignment: X3 = pred_booked.drop(['booked'], axis=1) was
# immediately overwritten by the explicit column selection below)
X3 = pred_booked[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
                  'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
                  'number_of_reviews', 'review_scores_location',
                  'review_scores_value', 'square_feet', 'host_response_time',
                  'host_response_rate', 'host_is_superhost', 'neighbourhood',
                  'review_scores_rating', 'review_scores_accuracy',
                  'review_scores_cleanliness', 'cancellation_policy', 'minimum_nights',
                  'availability_30', 'availability_60', 'availability_365']]
categorical_columns = ['property_type', 'room_type', 'bed_type', 'neighbourhood', 'host_response_time', 'cancellation_policy']
# Apply one-hot encoding; drop_first avoids the dummy-variable trap
X3 = pd.get_dummies(X3, columns=categorical_columns, drop_first=True)
y3 = pred_booked['booked']
# 70/30 split with a fixed seed for reproducibility
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=42)
X3_train.info()
y3_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 914 entries, 5414 to 4770 Columns: 116 entries, accommodates to cancellation_policy_super_strict_30 dtypes: float64(13), int64(7), uint8(96) memory usage: 235.6 KB <class 'pandas.core.series.Series'> Int64Index: 914 entries, 5414 to 4770 Series name: booked Non-Null Count Dtype -------------- ----- 914 non-null int64 dtypes: int64(1) memory usage: 14.3 KB
# Create an Instance of Logistic Regression for LASSO Selection using c = 0.1 and c = 0.01
# (smaller C = stronger L1 penalty = more coefficients driven to exactly zero)
lr_l1_1 = LogisticRegression(penalty='l1', solver='liblinear', C=.1)
lr_l1_01 = LogisticRegression(penalty='l1', solver='liblinear', C= .01)
# fit the models to the training data
lr_l1_1.fit(X3_train, y3_train)
lr_l1_01.fit(X3_train, y3_train)
# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2 = LogisticRegression(penalty='l2', C = .01, solver='liblinear')
# Create an instance of Logistic Regression for Elastic Net
# NOTE(review): saga on unscaled features with the default max_iter may not
# converge; the ConvergenceWarning is hidden by the file-level warnings filter
lr_l12 = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga')
# fit the models to the training data
lr_l2.fit(X3_train, y3_train)
lr_l12.fit(X3_train, y3_train)
LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga')
# function for model coefficients from sklearn logistic regression
# NOTE(review): this redefines rpt_model_variables from earlier in the file;
# this version reports against the booking-model design matrix X3 by default.
def rpt_model_variables(model, feature_names=None):
    """Return a DataFrame of model coefficients sorted by absolute magnitude.

    Parameters
    ----------
    model : fitted sklearn linear model exposing intercept_ and coef_
    feature_names : sequence of str, optional
        Column names matching the features the model was fit on. Defaults to
        the module-level X3.columns for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'coefficient' (rounded to 4 decimals) and
        'abs_coefficient', sorted by 'abs_coefficient' descending.
    """
    if feature_names is None:
        feature_names = X3.columns
    intercept = model.intercept_  # retrieved for reference; not added to the table
    # round for readable reporting
    coefficients = np.round(model.coef_[0], decimals=4)
    df_coefficients = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
    # Print the DataFrame
    print(df_coefficients)
    return df_coefficients
df_coefficients1 = rpt_model_variables(lr_l1_1)
df_coefficients01 = rpt_model_variables(lr_l1_01)
df_coefficients2 = rpt_model_variables(lr_l2)
df_coefficients12 = rpt_model_variables(lr_l12)
feature coefficient abs_coefficient
18 availability_60 -0.2195 0.2195
13 review_scores_rating 0.0739 0.0739
17 availability_30 0.0678 0.0678
16 minimum_nights 0.0145 0.0145
7 number_of_reviews -0.0127 0.0127
.. ... ... ...
42 neighbourhood_Balcones Civic Association 0.0000 0.0000
41 neighbourhood_Anderson Mill 0.0000 0.0000
40 bed_type_Real Bed 0.0000 0.0000
39 bed_type_Pull-out Sofa 0.0000 0.0000
115 cancellation_policy_super_strict_30 0.0000 0.0000
[116 rows x 3 columns]
feature coefficient abs_coefficient
18 availability_60 -0.1620 0.1620
13 review_scores_rating 0.0410 0.0410
7 number_of_reviews -0.0081 0.0081
19 availability_365 -0.0073 0.0073
11 host_response_rate 0.0031 0.0031
.. ... ... ...
41 neighbourhood_Anderson Mill 0.0000 0.0000
40 bed_type_Real Bed 0.0000 0.0000
39 bed_type_Pull-out Sofa 0.0000 0.0000
38 bed_type_Futon 0.0000 0.0000
115 cancellation_policy_super_strict_30 0.0000 0.0000
[116 rows x 3 columns]
feature coefficient abs_coefficient
18 availability_60 -0.2168 0.2168
17 availability_30 0.0665 0.0665
13 review_scores_rating 0.0610 0.0610
14 review_scores_accuracy 0.0562 0.0562
9 review_scores_value 0.0531 0.0531
.. ... ... ...
32 property_type_Tipi -0.0000 0.0000
100 neighbourhood_Walnut Creek -0.0000 0.0000
22 property_type_Bungalow -0.0000 0.0000
25 property_type_Chalet -0.0000 0.0000
43 neighbourhood_Barton Creek 0.0000 0.0000
[116 rows x 3 columns]
feature coefficient abs_coefficient
18 availability_60 -0.0382 0.0382
17 availability_30 -0.0164 0.0164
19 availability_365 -0.0077 0.0077
7 number_of_reviews -0.0072 0.0072
11 host_response_rate 0.0051 0.0051
.. ... ... ...
49 neighbourhood_Cherry Creek 0.0000 0.0000
48 neighbourhood_Bull Creek 0.0000 0.0000
47 neighbourhood_Bryker Woods 0.0000 0.0000
46 neighbourhood_Brentwood 0.0000 0.0000
115 cancellation_policy_super_strict_30 0.0000 0.0000
[116 rows x 3 columns]
# plot variable importance
# NOTE(review): redefinition of plot_variable_imp from earlier in the file.
def plot_variable_imp(df_coef):
    """Bar-plot |coefficient| for retained features and list rejected ones.

    Parameters
    ----------
    df_coef : pandas.DataFrame
        Output of rpt_model_variables: columns 'feature', 'coefficient',
        'abs_coefficient'. Features with abs_coefficient == 0 are treated
        as rejected by regularization and listed instead of plotted.
    """
    kept = df_coef[df_coef['abs_coefficient'] != 0]
    reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()
    plt.figure(figsize=(5, 10))
    # pass column NAMES and let seaborn resolve them against `data`
    # (the original passed redundant Series alongside data=)
    sns.barplot(data=kept, y='feature', x='abs_coefficient', color="lightblue")
    # set title/labels AFTER plotting — seaborn writes axis labels from the
    # column names and would overwrite labels set beforehand
    plt.title('Variable Importance')
    plt.xlabel('Coefficient')
    plt.ylabel('Feature')
    plt.show()
    print("-- rejected --")
    for i in reject_vars:
        print(f" {i}")
plot_variable_imp(df_coefficients1)
plot_variable_imp(df_coefficients01)
plot_variable_imp(df_coefficients2)
plot_variable_imp(df_coefficients12)
-- rejected -- neighbourhood_Pleasant Valley neighbourhood_Old West Austin neighbourhood_Parker Lane neighbourhood_Pecan Spings neighbourhood_Rollingwood neighbourhood_Rainey Street neighbourhood_Oak Hill neighbourhood_Rosedale neighbourhood_Rosewood neighbourhood_SW Williamson Co. neighbourhood_South Congress neighbourhood_Old Enfield accommodates neighbourhood_Northwest Hills neighbourhood_North Shoal Creek neighbourhood_North Loop neighbourhood_South Lamar neighbourhood_Mueller neighbourhood_Montopolis neighbourhood_Milwood neighbourhood_McKinney neighbourhood_MLK & 183 neighbourhood_Long Canyon neighbourhood_Lamplight Village neighbourhood_Hyde Park neighbourhood_South First neighbourhood_Tarrytown neighbourhood_South Manchaca neighbourhood_St. Edwards cancellation_policy_strict cancellation_policy_moderate host_response_time_within an hour host_response_time_within a few hours host_response_time_within a day neighbourhood_Zilker neighbourhood_Wooten neighbourhood_Windsor Park neighbourhood_Windsor Hills neighbourhood_Westlake Hills neighbourhood_Westgate neighbourhood_West Congress neighbourhood_West Campus neighbourhood_West Austin neighbourhood_Walnut Creek neighbourhood_Upper Boggy Creek neighbourhood_University of Texas neighbourhood_University Hills neighbourhood_Travis Heights neighbourhood_Highland neighbourhood_Sunset Valley neighbourhood_Steiner Ranch neighbourhood_St. 
Johns neighbourhood_Holly neighbourhood_East Riverside neighbourhood_Hancock neighbourhood_Gracywoods property_type_Villa property_type_Townhouse property_type_Tipi property_type_Tent property_type_Other property_type_Loft property_type_House property_type_Earth House property_type_Condominium property_type_Chalet property_type_Camper/RV property_type_Cabin property_type_Bungalow property_type_Boat property_type_Bed & Breakfast review_scores_cleanliness review_scores_accuracy host_is_superhost host_response_rate review_scores_value review_scores_location beds bedrooms room_type_Private room room_type_Shared room bed_type_Couch neighbourhood_Cherrywood neighbourhood_Govalle neighbourhood_Galindo bathrooms neighbourhood_East Downtown neighbourhood_East Congress neighbourhood_Downtown neighbourhood_Dawson neighbourhood_Crestview neighbourhood_Copperfield neighbourhood_Clarksville neighbourhood_Cherry Creek bed_type_Futon neighbourhood_Bull Creek neighbourhood_Bryker Woods neighbourhood_Brentwood neighbourhood_Bouldin Creek neighbourhood_Barton Hills neighbourhood_Barton Creek neighbourhood_Balcones Civic Association neighbourhood_Anderson Mill bed_type_Real Bed bed_type_Pull-out Sofa cancellation_policy_super_strict_30
-- rejected -- neighbourhood_Rainey Street neighbourhood_Old West Austin neighbourhood_Parker Lane neighbourhood_Pecan Spings neighbourhood_Pleasant Valley accommodates neighbourhood_Rollingwood neighbourhood_Oak Hill neighbourhood_Rosedale neighbourhood_Rosewood neighbourhood_SW Williamson Co. neighbourhood_South Congress neighbourhood_Old Enfield neighbourhood_North Loop neighbourhood_Northwest Hills neighbourhood_North Shoal Creek neighbourhood_South Lamar neighbourhood_Mueller neighbourhood_Montopolis neighbourhood_Milwood neighbourhood_McKinney neighbourhood_MLK & 183 neighbourhood_Long Canyon neighbourhood_Lamplight Village neighbourhood_Hyde Park neighbourhood_South First neighbourhood_Sunset Valley neighbourhood_South Manchaca neighbourhood_St. Edwards cancellation_policy_strict cancellation_policy_moderate host_response_time_within an hour host_response_time_within a few hours host_response_time_within a day neighbourhood_Zilker neighbourhood_Wooten neighbourhood_Windsor Park neighbourhood_Windsor Hills neighbourhood_Westlake Hills neighbourhood_Westgate neighbourhood_West Congress neighbourhood_West Campus neighbourhood_West Austin neighbourhood_Walnut Creek neighbourhood_Upper Boggy Creek neighbourhood_University of Texas neighbourhood_University Hills neighbourhood_Travis Heights neighbourhood_Tarrytown neighbourhood_Highland neighbourhood_Steiner Ranch neighbourhood_St. 
Johns neighbourhood_Holly neighbourhood_East Riverside neighbourhood_Hancock room_type_Private room property_type_Townhouse property_type_Tipi property_type_Tent property_type_Other property_type_Loft property_type_House property_type_Earth House property_type_Condominium property_type_Chalet property_type_Camper/RV property_type_Cabin property_type_Bungalow property_type_Boat property_type_Bed & Breakfast availability_30 minimum_nights review_scores_cleanliness review_scores_accuracy host_is_superhost review_scores_value review_scores_location beds bedrooms property_type_Villa room_type_Shared room neighbourhood_Gracywoods bed_type_Couch neighbourhood_Govalle neighbourhood_Galindo bathrooms neighbourhood_East Downtown neighbourhood_East Congress neighbourhood_Downtown neighbourhood_Dawson neighbourhood_Crestview neighbourhood_Copperfield neighbourhood_Clarksville neighbourhood_Cherrywood neighbourhood_Cherry Creek neighbourhood_Bull Creek neighbourhood_Bryker Woods neighbourhood_Brentwood neighbourhood_Bouldin Creek neighbourhood_Barton Hills neighbourhood_Barton Creek neighbourhood_Balcones Civic Association neighbourhood_Anderson Mill bed_type_Real Bed bed_type_Pull-out Sofa bed_type_Futon cancellation_policy_super_strict_30
-- rejected -- property_type_Villa neighbourhood_Bull Creek neighbourhood_Copperfield neighbourhood_Cherrywood property_type_Earth House property_type_Bed & Breakfast property_type_Boat neighbourhood_Rollingwood neighbourhood_Lamplight Village neighbourhood_Old Enfield neighbourhood_Rainey Street neighbourhood_St. Johns neighbourhood_Sunset Valley property_type_Tipi neighbourhood_Walnut Creek property_type_Bungalow property_type_Chalet neighbourhood_Barton Creek
-- rejected -- neighbourhood_Pleasant Valley neighbourhood_South Lamar neighbourhood_South First neighbourhood_South Congress neighbourhood_SW Williamson Co. neighbourhood_Rosewood neighbourhood_Rosedale neighbourhood_Rollingwood neighbourhood_Rainey Street neighbourhood_Pecan Spings neighbourhood_Milwood neighbourhood_Parker Lane neighbourhood_Old West Austin neighbourhood_Old Enfield neighbourhood_Oak Hill neighbourhood_Northwest Hills neighbourhood_North Shoal Creek neighbourhood_North Loop neighbourhood_Mueller neighbourhood_St. Edwards neighbourhood_Montopolis neighbourhood_South Manchaca neighbourhood_Walnut Creek neighbourhood_St. Johns neighbourhood_Westgate cancellation_policy_strict cancellation_policy_moderate host_response_time_within a few hours host_response_time_within a day neighbourhood_Zilker neighbourhood_Wooten neighbourhood_Windsor Park neighbourhood_Windsor Hills neighbourhood_Westlake Hills neighbourhood_West Congress neighbourhood_Steiner Ranch neighbourhood_West Campus neighbourhood_West Austin neighbourhood_MLK & 183 neighbourhood_Upper Boggy Creek neighbourhood_University of Texas neighbourhood_University Hills neighbourhood_Travis Heights neighbourhood_Tarrytown neighbourhood_Sunset Valley neighbourhood_McKinney neighbourhood_East Riverside neighbourhood_Long Canyon neighbourhood_Lamplight Village neighbourhood_Anderson Mill bed_type_Pull-out Sofa bed_type_Futon bed_type_Couch room_type_Shared room property_type_Villa property_type_Townhouse property_type_Tipi property_type_Tent property_type_Other property_type_Loft property_type_House property_type_Earth House property_type_Condominium property_type_Chalet property_type_Camper/RV property_type_Cabin property_type_Bungalow property_type_Boat property_type_Bed & Breakfast bedrooms neighbourhood_Balcones Civic Association neighbourhood_Barton Creek neighbourhood_Barton Hills neighbourhood_East Congress neighbourhood_Hyde Park neighbourhood_Holly neighbourhood_Highland 
neighbourhood_Hancock neighbourhood_Gracywoods neighbourhood_Govalle neighbourhood_Galindo bathrooms neighbourhood_East Downtown neighbourhood_Downtown neighbourhood_Bouldin Creek neighbourhood_Dawson neighbourhood_Crestview neighbourhood_Copperfield neighbourhood_Clarksville neighbourhood_Cherrywood neighbourhood_Cherry Creek neighbourhood_Bull Creek neighbourhood_Bryker Woods neighbourhood_Brentwood cancellation_policy_super_strict_30
# make predictions on the testing data
# Variable suffix key: (no suffix) = LASSO C=0.1, 1 = LASSO C=0.01,
#                      2 = ridge C=0.01, 12 = elastic net.
# predict_proba returns both class columns; column 1 (positive class) is
# sliced later for the AUC calculations.
y_pred_train = lr_l1_1.predict(X3_train)
y_pred_test = lr_l1_1.predict(X3_test)
y_proba_train = lr_l1_1.predict_proba(X3_train)
y_proba_test = lr_l1_1.predict_proba(X3_test)
y_pred_train1 = lr_l1_01.predict(X3_train)
y_pred_test1 = lr_l1_01.predict(X3_test)
y_proba_train1 = lr_l1_01.predict_proba(X3_train)
y_proba_test1 = lr_l1_01.predict_proba(X3_test)
y_pred_train2 = lr_l2.predict(X3_train)
y_pred_test2 = lr_l2.predict(X3_test)
y_proba_train2 = lr_l2.predict_proba(X3_train)
y_proba_test2 = lr_l2.predict_proba(X3_test)
y_pred_train12 = lr_l12.predict(X3_train)
y_pred_test12 = lr_l12.predict(X3_test)
y_proba_train12 = lr_l12.predict_proba(X3_train)
y_proba_test12 = lr_l12.predict_proba(X3_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Evaluate the first L1-regularized model (lr_l1_1) on the training split.
acc3_train = accuracy_score(y3_train, y_pred_train)
prec3_train = precision_score(y3_train, y_pred_train)
rec3_train = recall_score(y3_train, y_pred_train)
auc3_train = roc_auc_score(y3_train, y_proba_train[:, 1])  # AUC needs P(class=1)
# Print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc3_train))
print("Precision: {:.4f}".format(prec3_train))
print("Recall   : {:.4f}".format(rec3_train))
print("AUC      : {:.4f}".format(auc3_train))
print("")
# Evaluate on the held-out test split.
# BUG FIX: the original assigned recall_score(...) to acc3_test (clobbering
# the accuracy value) and then printed acc3_test on the "Recall" line, which
# is why the test-set Accuracy/Recall rows showed identical numbers.
acc3_test = accuracy_score(y3_test, y_pred_test)
prec3_test = precision_score(y3_test, y_pred_test)
rec3_test = recall_score(y3_test, y_pred_test)
# Reuse the cached probabilities instead of recomputing predict_proba.
auc3_test = roc_auc_score(y3_test, y_proba_test[:, 1])
print(" -- test set -- ")  # header was missing in the original printout
print("Accuracy : {:.4f}".format(acc3_test))
print("Precision: {:.4f}".format(prec3_test))
print("Recall   : {:.4f}".format(rec3_test))
print("AUC      : {:.4f}".format(auc3_test))
-- train set -- Accuracy : 0.9409 Precision: 0.8863 Recall : 0.8618 AUC : 0.9885 Accuracy : 0.8774 Precision: 0.8774 Recall : 0.8774 AUC : 0.9864
# Evaluate the second L1-regularized model (lr_l1_01) on the training split,
# then on the test split, printing each metric as we go.
acc3_train1 = accuracy_score(y3_train, y_pred_train1)
prec3_train1 = precision_score(y3_train, y_pred_train1)
rec3_train1 = recall_score(y3_train, y_pred_train1)
auc3_train1 = roc_auc_score(y3_train, y_proba_train1[:, 1])
print(" -- train set -- ")
for label, value in (("Accuracy :", acc3_train1), ("Precision:", prec3_train1),
                     ("Recall. :", rec3_train1), ("AUC :", auc3_train1)):
    print("{} {:.4f}".format(label, value))
print("")
acc3_test1 = accuracy_score(y3_test, y_pred_test1)
prec3_test1 = precision_score(y3_test, y_pred_test1)
rec3_test1 = recall_score(y3_test, y_pred_test1)
auc3_test1 = roc_auc_score(y3_test, y_proba_test1[:, 1])
print(" -- test set -- ")
for label, value in (("Accuracy :", acc3_test1), ("Precision:", prec3_test1),
                     ("Recall. :", rec3_test1), ("AUC :", auc3_test1)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 0.9431 Precision: 0.8873 Recall. : 0.8710 AUC : 0.9872 -- test set -- Accuracy : 0.9389 Precision: 0.8942 Recall. : 0.8774 AUC : 0.9867
# Evaluate the L2-regularized model (lr_l2) on the training split, then on
# the test split, printing each metric as we go.
acc3_train2 = accuracy_score(y3_train, y_pred_train2)
prec3_train2 = precision_score(y3_train, y_pred_train2)
rec3_train2 = recall_score(y3_train, y_pred_train2)
auc3_train2 = roc_auc_score(y3_train, y_proba_train2[:, 1])
print(" -- train set -- ")
for label, value in (("Accuracy :", acc3_train2), ("Precision:", prec3_train2),
                     ("Recall :", rec3_train2), ("AUC :", auc3_train2)):
    print("{} {:.4f}".format(label, value))
print("")
acc3_test2 = accuracy_score(y3_test, y_pred_test2)
prec3_test2 = precision_score(y3_test, y_pred_test2)
rec3_test2 = recall_score(y3_test, y_pred_test2)
auc3_test2 = roc_auc_score(y3_test, y_proba_test2[:, 1])
print(" -- test set -- ")
for label, value in (("Accuracy :", acc3_test2), ("Precision:", prec3_test2),
                     ("Recall :", rec3_test2), ("AUC :", auc3_test2)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 0.9431 Precision: 0.8873 Recall : 0.8710 AUC : 0.9891 -- test set -- Accuracy : 0.9364 Precision: 0.8785 Recall : 0.8868 AUC : 0.9867
# Evaluate the elastic-net style model (lr_l12) on the training split, then
# on the test split, printing each metric as we go.
acc3_train12 = accuracy_score(y3_train, y_pred_train12)
prec3_train12 = precision_score(y3_train, y_pred_train12)
rec3_train12 = recall_score(y3_train, y_pred_train12)
auc3_train12 = roc_auc_score(y3_train, y_proba_train12[:, 1])
print(" -- train set -- ")
for label, value in (("Accuracy :", acc3_train12), ("Precision:", prec3_train12),
                     ("Recall. :", rec3_train12), ("AUC :", auc3_train12)):
    print("{} {:.4f}".format(label, value))
print("")
acc3_test12 = accuracy_score(y3_test, y_pred_test12)
prec3_test12 = precision_score(y3_test, y_pred_test12)
rec3_test12 = recall_score(y3_test, y_pred_test12)
auc3_test12 = roc_auc_score(y3_test, y_proba_test12[:, 1])
print(" -- test set -- ")
for label, value in (("Accuracy :", acc3_test12), ("Precision:", prec3_test12),
                     ("Recall. :", rec3_test12), ("AUC :", auc3_test12)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 0.9092 Precision: 0.8895 Recall. : 0.7051 AUC : 0.9675 -- test set -- Accuracy : 0.8957 Precision: 0.8495 Recall. : 0.7453 AUC : 0.9632
from IPython.core.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:90% }</style>"))
import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
df.columns
Index(['id', 'listing_url', 'name', 'summary', 'space', 'description',
'experiences_offered', 'neighborhood_overview', 'notes', 'transit',
'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
'host_response_time', 'host_response_rate', 'host_is_superhost',
'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
'neighbourhood', 'city', 'property_type', 'room_type', 'accommodates',
'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
'price', 'weekly_price', 'security_deposit', 'cleaning_fee',
'guests_included', 'extra_people', 'minimum_nights', 'has_availability',
'availability_30', 'availability_60', 'availability_90',
'availability_365', 'number_of_reviews', 'review_scores_rating',
'review_scores_accuracy', 'review_scores_cleanliness',
'review_scores_checkin', 'review_scores_communication',
'review_scores_location', 'review_scores_value', 'instant_bookable',
'cancellation_policy', 'booked'],
dtype='object')
Create a new dataframe containing only the clustering features, to keep the clustering work separate from the full dataset.
# Select the listing features used for K-means clustering into a separate
# dataframe. All of these columns are presumably numeric/encoded by this
# point (e.g. host_is_superhost, instant_bookable) — TODO confirm upstream
# preprocessing, since StandardScaler below requires numeric input.
pred_cluster = df[[ 'host_response_rate', 'host_is_superhost',
'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
'accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet', 'price',
'weekly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
'minimum_nights', 'has_availability', 'availability_30', 'availability_60', 'availability_90',
'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
'review_scores_location', 'review_scores_value', 'instant_bookable', 'booked']
]
Keep only numeric columns, since K-means clustering requires numeric features.
# Standardize every feature to zero mean / unit variance so that the
# distance-based K-means is not dominated by large-scale columns
# (price, square_feet, availability_365, ...).
dfcolumns = pred_cluster.columns.values.tolist()
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(pred_cluster.to_numpy()),
                         columns=dfcolumns)
df_scaled.head()
| host_response_rate | host_is_superhost | host_listings_count | host_has_profile_pic | host_identity_verified | accommodates | bathrooms | bedrooms | beds | square_feet | ... | number_of_reviews | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | booked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.452536 | 1.660044 | -0.090166 | 0.027671 | 0.483925 | -0.892166 | -0.642284 | -0.651249 | -0.782890 | 1.145794e-15 | ... | -0.298505 | 0.823894 | 5.312191e-01 | 5.790691e-01 | 3.219625e-01 | 0.303469 | 0.752130 | 8.037821e-01 | 2.756571 | -0.572933 |
| 1 | 0.452536 | -0.602394 | -0.163047 | 0.027671 | 0.483925 | 4.121610 | 2.321408 | 3.009752 | 5.296948 | 1.145794e-15 | ... | -0.468815 | 0.823894 | 5.312191e-01 | 5.790691e-01 | 3.219625e-01 | 0.303469 | 0.752130 | 8.037821e-01 | -0.362770 | -0.572933 |
| 2 | 0.452536 | 1.660044 | -0.090166 | 0.027671 | 0.483925 | -0.892166 | -0.642284 | -0.651249 | -0.782890 | 1.145794e-15 | ... | -0.264443 | 0.823894 | 5.312191e-01 | 5.790691e-01 | 3.219625e-01 | 0.303469 | 0.752130 | 8.037821e-01 | -0.362770 | -0.572933 |
| 3 | 0.452536 | -0.602394 | -0.163047 | 0.027671 | 0.483925 | -0.892166 | -0.642284 | -0.651249 | -0.782890 | 1.145794e-15 | ... | -0.605063 | 0.000000 | -3.150333e-15 | -2.536195e-15 | -4.803236e-15 | 0.000000 | 0.000000 | 2.753072e-15 | 2.756571 | -0.572933 |
| 4 | 0.000000 | -0.602394 | -0.163047 | 0.027671 | 0.483925 | -1.250293 | -0.642284 | -0.651249 | -0.174906 | 1.145794e-15 | ... | -0.536939 | -3.439806 | -1.242261e+00 | -8.486816e-01 | 3.219625e-01 | -2.555532 | -0.814807 | -7.460599e-01 | -0.362770 | -0.572933 |
5 rows × 32 columns
# set the model type to k-means
model = KMeans()
# generate an elbow plot for k = 2 to 12 clusters using the scaled data;
# yellowbrick fits one KMeans per k and plots the distortion score so the
# "elbow" k can be picked visually for the final clustering below
visualizer = KElbowVisualizer(model, k=(2, 12))
visualizer.fit(df_scaled)
visualizer.show()
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Fit K-means with k=4 (chosen from the elbow plot above) on the scaled data.
kmeans = KMeans(n_clusters=4, random_state=904)
kmclus = kmeans.fit(df_scaled)
# BUG FIX: the original called fit() and then fit_predict(), which silently
# re-fits the whole model a second time; the labels from the first fit are
# already available on the fitted estimator.
clusters = kmeans.labels_
# append cluster to original (unscaled) data so per-cluster summaries are in
# interpretable units
pred_cluster['cluster'] = clusters
pred_booked.info()  # NOTE(review): looks like a leftover — pred_cluster.info() was probably intended; verify
# Evaluate cluster sizes to ensure that they are reasonably balanced
print(pred_cluster['cluster'].value_counts())
# Examine the means of each variable by cluster
df_desc = pred_cluster.groupby('cluster').mean()
print(df_desc)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 44 to 5830
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 property_type 1307 non-null object
1 room_type 1307 non-null object
2 accommodates 1307 non-null int64
3 bathrooms 1307 non-null float64
4 bedrooms 1307 non-null float64
5 beds 1307 non-null float64
6 bed_type 1307 non-null object
7 price 1307 non-null float64
8 security_deposit 1307 non-null float64
9 cleaning_fee 1307 non-null float64
10 number_of_reviews 1307 non-null int64
11 review_scores_location 1307 non-null float64
12 review_scores_value 1307 non-null float64
13 square_feet 1307 non-null float64
14 host_response_time 1115 non-null object
15 host_response_rate 1307 non-null float64
16 host_is_superhost 1307 non-null int64
17 neighbourhood 1307 non-null object
18 review_scores_rating 1307 non-null float64
19 review_scores_accuracy 1307 non-null float64
20 review_scores_cleanliness 1307 non-null float64
21 cancellation_policy 1307 non-null object
22 minimum_nights 1307 non-null int64
23 availability_30 1307 non-null int64
24 availability_60 1307 non-null int64
25 availability_365 1307 non-null int64
dtypes: float64(13), int64(7), object(6)
memory usage: 275.7+ KB
0 681
1 310
2 211
3 105
Name: cluster, dtype: int64
host_response_rate host_is_superhost host_listings_count \
cluster
0 95.566201 0.301028 2.559471
1 95.426773 0.212903 2.258065
2 94.891790 0.345972 5.099526
3 93.589716 0.038095 6.780952
host_has_profile_pic host_identity_verified accommodates \
cluster
0 0.998532 0.787078 3.324523
1 1.000000 0.787097 4.158065
2 1.000000 0.900474 8.706161
3 1.000000 0.847619 4.571429
bathrooms bedrooms beds square_feet ... number_of_reviews \
cluster ...
0 1.160059 1.243759 1.665198 1122.228648 ... 22.967695
1 1.379032 1.641935 2.035484 1144.669580 ... 9.506452
2 2.433649 3.379147 4.720379 1232.594566 ... 14.530806
3 1.357143 1.600000 2.180952 1139.206866 ... 14.885714
review_scores_rating review_scores_accuracy \
cluster
0 97.148616 9.812407
1 96.393516 9.754662
2 97.410680 9.797694
3 86.238095 8.619048
review_scores_cleanliness review_scores_checkin \
cluster
0 9.743037 9.952098
1 9.611802 9.924033
2 9.726247 9.940981
3 8.314286 9.171429
review_scores_communication review_scores_location \
cluster
0 9.962920 9.611043
1 9.927735 9.585290
2 9.947497 9.464834
3 9.238095 8.847619
review_scores_value instant_bookable booked
cluster
0 9.634400 0.136564 0.001468
1 9.507016 0.087097 0.961290
2 9.473744 0.085308 0.085308
3 8.428571 0.133333 0.057143
[4 rows x 32 columns]
# Distribution of nightly price within each cluster, one panel per cluster.
# FIX: dropped the unused plt.figure(figsize=(8,4)) — sns.displot is a
# figure-level function that creates its own figure, so the original call
# only produced the stray empty "<Figure ... with 0 Axes>" output. Also
# removed the stale "#day charge ..." comment copied from another dataset.
print("Highest Nightly Rates")
sns.displot(data=pred_cluster, x="price", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Highest Nightly Rates
<Figure size 800x400 with 0 Axes>
# Distribution of overall review score within each cluster.
# FIX: dropped the unused plt.figure(...) (sns.displot creates its own
# figure; the original left an empty "<Figure ... with 0 Axes>" artifact)
# and the stale "#day charge ..." comment copied from another dataset.
print("Review Scores Rating")
sns.displot(data=pred_cluster, x="review_scores_rating", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Review Scores Rating
<Figure size 800x400 with 0 Axes>
# Distribution of the "value" review sub-score within each cluster.
# FIX: dropped the unused plt.figure(...) (sns.displot creates its own
# figure; the original left an empty "<Figure ... with 0 Axes>" artifact)
# and the stale "#day charge ..." comment copied from another dataset.
print("Review Scores Value")
sns.displot(data=pred_cluster, x="review_scores_value", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Review Scores Value
<Figure size 800x400 with 0 Axes>
# Distribution of the binary "booked" target within each cluster.
# FIX: dropped the unused plt.figure(...) (sns.displot creates its own
# figure; the original left an empty "<Figure ... with 0 Axes>" artifact)
# and the stale "#day charge ..." comment copied from another dataset.
print("Booked")
sns.displot(data=pred_cluster, x="booked", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Booked
<Figure size 800x400 with 0 Axes>
# Take a direct reference to the clustered frame, then carve out one
# sub-frame per cluster label for per-cluster modelling.
df2 = pred_cluster
c0, c1, c2, c3 = (df2[df2['cluster'] == label] for label in range(4))
# Hold out 30% of cluster 3 for testing. Drop the target ('booked'), the
# bookkeeping 'cluster' column, and 'availability_90' (presumably redundant
# with the other availability windows — unverified) from the features.
y4 = c3['booked']
X4 = c3.drop(['booked', 'cluster', 'availability_90'], axis=1)
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, test_size=0.3, random_state=42)
X4_train.info()
y4_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 73 entries, 5800 to 5774 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_response_rate 73 non-null float64 1 host_is_superhost 73 non-null int64 2 host_listings_count 73 non-null float64 3 host_has_profile_pic 73 non-null int64 4 host_identity_verified 73 non-null int64 5 accommodates 73 non-null int64 6 bathrooms 73 non-null float64 7 bedrooms 73 non-null float64 8 beds 73 non-null float64 9 square_feet 73 non-null float64 10 price 73 non-null float64 11 weekly_price 73 non-null float64 12 security_deposit 73 non-null float64 13 cleaning_fee 73 non-null float64 14 guests_included 73 non-null int64 15 extra_people 73 non-null float64 16 minimum_nights 73 non-null int64 17 has_availability 73 non-null int64 18 availability_30 73 non-null int64 19 availability_60 73 non-null int64 20 availability_365 73 non-null int64 21 number_of_reviews 73 non-null int64 22 review_scores_rating 73 non-null float64 23 review_scores_accuracy 73 non-null float64 24 review_scores_cleanliness 73 non-null float64 25 review_scores_checkin 73 non-null float64 26 review_scores_communication 73 non-null float64 27 review_scores_location 73 non-null float64 28 review_scores_value 73 non-null float64 29 instant_bookable 73 non-null int64 dtypes: float64(18), int64(12) memory usage: 17.7 KB <class 'pandas.core.series.Series'> Int64Index: 73 entries, 5800 to 5774 Series name: booked Non-Null Count Dtype -------------- ----- 73 non-null int64 dtypes: int64(1) memory usage: 1.1 KB
# Tune a decision tree on the cluster-3 training data: 5-fold CV grid search
# over tree depth, minimum leaf size, and cost-complexity pruning strength.
dt_tune = DecisionTreeClassifier()
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20, 25],
    min_samples_leaf=[1, 10, 20, 50, 100],
    ccp_alpha=[0, 0.001, 0.01, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X4_train, y4_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
{'ccp_alpha': 0, 'max_depth': 10, 'min_samples_leaf': 1}
# Fit the final tree for cluster 3 using the hyper-parameters selected by the
# grid search above.
# FIX: the original hard-coded max_depth=None, ccp_alpha=0.001, silently
# discarding the tuned values that GridSearchCV just reported.
dt = DecisionTreeClassifier(**best_params)
# fit the model to the training data
dt.fit(X4_train, y4_train)
DecisionTreeClassifier(ccp_alpha=0.001)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0.001)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Write the fitted tree in Graphviz .dot format, then render it to tree.png
# with the `dot` CLI (notebook shell escape) for inline display below.
export_graphviz(dt,
'tree.dot',
class_names=['0','1'],
feature_names = X4_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7d0b3e624190>
# Class predictions and class probabilities for both splits of cluster 3.
y4_pred_train, y4_pred_test = dt.predict(X4_train), dt.predict(X4_test)
y4_prob_train, y4_prob_test = dt.predict_proba(X4_train), dt.predict_proba(X4_test)
# Report accuracy / precision / recall for the cluster-3 tree on both splits.
acc_train = accuracy_score(y4_train, y4_pred_train)
prec_train = precision_score(y4_train, y4_pred_train)
rec_train = recall_score(y4_train, y4_pred_train)
print(" -- train set -- ")
for label, value in (("Accuracy :", acc_train), ("Precision:", prec_train),
                     ("Recall. :", rec_train)):
    print("{} {:.4f}".format(label, value))
print("")
acc_test = accuracy_score(y4_test, y4_pred_test)
prec_test = precision_score(y4_test, y4_pred_test)
rec_test = recall_score(y4_test, y4_pred_test)
print(" -- test set -- ")
for label, value in (("Accuracy :", acc_test), ("Precision:", prec_test),
                     ("Recall. :", rec_test)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 1.0000 Precision: 1.0000 Recall. : 1.0000 -- test set -- Accuracy : 0.9688 Precision: 1.0000 Recall. : 0.5000
# Hold out 30% of cluster 2 for testing. Drop the target ('booked'), the
# bookkeeping 'cluster' column, and 'availability_90' (presumably redundant
# with the other availability windows — unverified) from the features.
y5 = c2['booked']
X5 = c2.drop(['booked', 'cluster', 'availability_90'], axis=1)
X5_train, X5_test, y5_train, y5_test = train_test_split(
    X5, y5, test_size=0.3, random_state=42)
X5_train.info()
y5_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 147 entries, 5557 to 2503 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_response_rate 147 non-null float64 1 host_is_superhost 147 non-null int64 2 host_listings_count 147 non-null float64 3 host_has_profile_pic 147 non-null int64 4 host_identity_verified 147 non-null int64 5 accommodates 147 non-null int64 6 bathrooms 147 non-null float64 7 bedrooms 147 non-null float64 8 beds 147 non-null float64 9 square_feet 147 non-null float64 10 price 147 non-null float64 11 weekly_price 147 non-null float64 12 security_deposit 147 non-null float64 13 cleaning_fee 147 non-null float64 14 guests_included 147 non-null int64 15 extra_people 147 non-null float64 16 minimum_nights 147 non-null int64 17 has_availability 147 non-null int64 18 availability_30 147 non-null int64 19 availability_60 147 non-null int64 20 availability_365 147 non-null int64 21 number_of_reviews 147 non-null int64 22 review_scores_rating 147 non-null float64 23 review_scores_accuracy 147 non-null float64 24 review_scores_cleanliness 147 non-null float64 25 review_scores_checkin 147 non-null float64 26 review_scores_communication 147 non-null float64 27 review_scores_location 147 non-null float64 28 review_scores_value 147 non-null float64 29 instant_bookable 147 non-null int64 dtypes: float64(18), int64(12) memory usage: 35.6 KB <class 'pandas.core.series.Series'> Int64Index: 147 entries, 5557 to 2503 Series name: booked Non-Null Count Dtype -------------- ----- 147 non-null int64 dtypes: int64(1) memory usage: 2.3 KB
# Tune a decision tree on the cluster-2 training data: 5-fold CV grid search
# over tree depth, minimum leaf size, and cost-complexity pruning strength.
dt_tune = DecisionTreeClassifier()
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20, 25],
    min_samples_leaf=[1, 10, 20, 50, 100],
    ccp_alpha=[0, 0.001, 0.01, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X5_train, y5_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 10}
# Fit the final tree for cluster 2 using the hyper-parameters selected by the
# grid search above.
# FIX: the original hard-coded ccp_alpha=0.001 even though the grid search
# reported ccp_alpha=0 as best, silently discarding the tuned value.
dt = DecisionTreeClassifier(**best_params)
# fit the model to the training data
dt.fit(X5_train, y5_train)
DecisionTreeClassifier(ccp_alpha=0.001, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0.001, min_samples_leaf=10)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Write the fitted tree in Graphviz .dot format, then render it to tree.png
# with the `dot` CLI (notebook shell escape) for inline display below.
export_graphviz(dt,
'tree.dot',
class_names=['0','1'],
feature_names = X5_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7d0b3e317430>
# Class predictions and class probabilities for both splits of cluster 2.
y5_pred_train, y5_pred_test = dt.predict(X5_train), dt.predict(X5_test)
y5_prob_train, y5_prob_test = dt.predict_proba(X5_train), dt.predict_proba(X5_test)
# Report accuracy / precision / recall for the cluster-2 tree on both splits.
acc_train = accuracy_score(y5_train, y5_pred_train)
prec_train = precision_score(y5_train, y5_pred_train)
rec_train = recall_score(y5_train, y5_pred_train)
print(" -- train set -- ")
for label, value in (("Accuracy :", acc_train), ("Precision:", prec_train),
                     ("Recall. :", rec_train)):
    print("{} {:.4f}".format(label, value))
print("")
acc_test = accuracy_score(y5_test, y5_pred_test)
prec_test = precision_score(y5_test, y5_pred_test)
rec_test = recall_score(y5_test, y5_pred_test)
print(" -- test set -- ")
for label, value in (("Accuracy :", acc_test), ("Precision:", prec_test),
                     ("Recall. :", rec_test)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 0.9864 Precision: 0.9286 Recall. : 0.9286 -- test set -- Accuracy : 0.9844 Precision: 1.0000 Recall. : 0.7500
# Hold out 30% of cluster 1 for testing. Drop the target ('booked'), the
# bookkeeping 'cluster' column, and 'availability_90' (presumably redundant
# with the other availability windows — unverified) from the features.
y6 = c1['booked']
X6 = c1.drop(['booked', 'cluster', 'availability_90'], axis=1)
X6_train, X6_test, y6_train, y6_test = train_test_split(
    X6, y6, test_size=0.3, random_state=42)
X6_train.info()
y6_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 217 entries, 5490 to 1730 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_response_rate 217 non-null float64 1 host_is_superhost 217 non-null int64 2 host_listings_count 217 non-null float64 3 host_has_profile_pic 217 non-null int64 4 host_identity_verified 217 non-null int64 5 accommodates 217 non-null int64 6 bathrooms 217 non-null float64 7 bedrooms 217 non-null float64 8 beds 217 non-null float64 9 square_feet 217 non-null float64 10 price 217 non-null float64 11 weekly_price 217 non-null float64 12 security_deposit 217 non-null float64 13 cleaning_fee 217 non-null float64 14 guests_included 217 non-null int64 15 extra_people 217 non-null float64 16 minimum_nights 217 non-null int64 17 has_availability 217 non-null int64 18 availability_30 217 non-null int64 19 availability_60 217 non-null int64 20 availability_365 217 non-null int64 21 number_of_reviews 217 non-null int64 22 review_scores_rating 217 non-null float64 23 review_scores_accuracy 217 non-null float64 24 review_scores_cleanliness 217 non-null float64 25 review_scores_checkin 217 non-null float64 26 review_scores_communication 217 non-null float64 27 review_scores_location 217 non-null float64 28 review_scores_value 217 non-null float64 29 instant_bookable 217 non-null int64 dtypes: float64(18), int64(12) memory usage: 52.6 KB <class 'pandas.core.series.Series'> Int64Index: 217 entries, 5490 to 1730 Series name: booked Non-Null Count Dtype -------------- ----- 217 non-null int64 dtypes: int64(1) memory usage: 3.4 KB
# Tune a decision tree on the cluster-1 training data: 5-fold CV grid search
# over tree depth, minimum leaf size, and cost-complexity pruning strength.
dt_tune = DecisionTreeClassifier()
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20, 25],
    min_samples_leaf=[1, 10, 20, 50, 100],
    ccp_alpha=[0, 0.001, 0.01, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X6_train, y6_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
{'ccp_alpha': 0.01, 'max_depth': None, 'min_samples_leaf': 1}
# Fit the final tree for cluster 1 using the hyper-parameters selected by the
# grid search above.
# FIX: the original hard-coded ccp_alpha=0.001 even though the grid search
# reported ccp_alpha=0.01 as best, silently discarding the tuned value.
dt = DecisionTreeClassifier(**best_params)
# fit the model to the training data
dt.fit(X6_train, y6_train)
DecisionTreeClassifier(ccp_alpha=0.001)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0.001)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Write the fitted tree in Graphviz .dot format, then render it to tree.png
# with the `dot` CLI (notebook shell escape) for inline display below.
export_graphviz(dt,
'tree.dot',
class_names=['0','1'],
feature_names = X6_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7d0b3e0f5b70>
# Class predictions and class probabilities for both splits of cluster 1.
y6_pred_train, y6_pred_test = dt.predict(X6_train), dt.predict(X6_test)
y6_prob_train, y6_prob_test = dt.predict_proba(X6_train), dt.predict_proba(X6_test)
# Report accuracy / precision / recall for the cluster-1 tree on both splits.
acc_train = accuracy_score(y6_train, y6_pred_train)
prec_train = precision_score(y6_train, y6_pred_train)
rec_train = recall_score(y6_train, y6_pred_train)
print(" -- train set -- ")
for label, value in (("Accuracy :", acc_train), ("Precision:", prec_train),
                     ("Recall. :", rec_train)):
    print("{} {:.4f}".format(label, value))
print("")
acc_test = accuracy_score(y6_test, y6_pred_test)
prec_test = precision_score(y6_test, y6_pred_test)
rec_test = recall_score(y6_test, y6_pred_test)
print(" -- test set -- ")
for label, value in (("Accuracy :", acc_test), ("Precision:", prec_test),
                     ("Recall. :", rec_test)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 1.0000 Precision: 1.0000 Recall. : 1.0000 -- test set -- Accuracy : 0.9140 Precision: 0.9326 Recall. : 0.9765
# Hold out 30% of cluster 0 for testing. Drop the target ('booked'), the
# bookkeeping 'cluster' column, and 'availability_90' (presumably redundant
# with the other availability windows — unverified) from the features.
y7 = c0['booked']
X7 = c0.drop(['booked', 'cluster', 'availability_90'], axis=1)
X7_train, X7_test, y7_train, y7_test = train_test_split(
    X7, y7, test_size=0.3, random_state=42)
X7_train.info()
y7_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 476 entries, 3763 to 921 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_response_rate 476 non-null float64 1 host_is_superhost 476 non-null int64 2 host_listings_count 476 non-null float64 3 host_has_profile_pic 476 non-null int64 4 host_identity_verified 476 non-null int64 5 accommodates 476 non-null int64 6 bathrooms 476 non-null float64 7 bedrooms 476 non-null float64 8 beds 476 non-null float64 9 square_feet 476 non-null float64 10 price 476 non-null float64 11 weekly_price 476 non-null float64 12 security_deposit 476 non-null float64 13 cleaning_fee 476 non-null float64 14 guests_included 476 non-null int64 15 extra_people 476 non-null float64 16 minimum_nights 476 non-null int64 17 has_availability 476 non-null int64 18 availability_30 476 non-null int64 19 availability_60 476 non-null int64 20 availability_365 476 non-null int64 21 number_of_reviews 476 non-null int64 22 review_scores_rating 476 non-null float64 23 review_scores_accuracy 476 non-null float64 24 review_scores_cleanliness 476 non-null float64 25 review_scores_checkin 476 non-null float64 26 review_scores_communication 476 non-null float64 27 review_scores_location 476 non-null float64 28 review_scores_value 476 non-null float64 29 instant_bookable 476 non-null int64 dtypes: float64(18), int64(12) memory usage: 115.3 KB <class 'pandas.core.series.Series'> Int64Index: 476 entries, 3763 to 921 Series name: booked Non-Null Count Dtype -------------- ----- 476 non-null int64 dtypes: int64(1) memory usage: 7.4 KB
# Tune a decision tree on the cluster-0 training data: 5-fold CV grid search
# over tree depth, minimum leaf size, and cost-complexity pruning strength.
dt_tune = DecisionTreeClassifier()
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20, 25],
    min_samples_leaf=[1, 10, 20, 50, 100],
    ccp_alpha=[0, 0.001, 0.01, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X7_train, y7_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 1}
# Fit the final tree for cluster 0 using the hyper-parameters selected by the
# grid search above.
# FIX: the original hard-coded max_depth=10, ccp_alpha=0.001 even though the
# grid search reported max_depth=None, ccp_alpha=0 as best, silently
# discarding the tuned values.
dt = DecisionTreeClassifier(**best_params)
# fit the model to the training data
dt.fit(X7_train, y7_train)
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=10)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Write the fitted tree in Graphviz .dot format, then render it to tree.png
# with the `dot` CLI (notebook shell escape) for inline display below.
export_graphviz(dt,
'tree.dot',
class_names=['0','1'],
feature_names = X7_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7d0b5ac89090>
# Class predictions and class probabilities for both splits of cluster 0.
y7_pred_train, y7_pred_test = dt.predict(X7_train), dt.predict(X7_test)
y7_prob_train, y7_prob_test = dt.predict_proba(X7_train), dt.predict_proba(X7_test)
# Report accuracy / precision / recall for the cluster-0 tree on both splits.
acc_train = accuracy_score(y7_train, y7_pred_train)
prec_train = precision_score(y7_train, y7_pred_train)
rec_train = recall_score(y7_train, y7_pred_train)
print(" -- train set -- ")
for label, value in (("Accuracy :", acc_train), ("Precision:", prec_train),
                     ("Recall. :", rec_train)):
    print("{} {:.4f}".format(label, value))
print("")
acc_test = accuracy_score(y7_test, y7_pred_test)
prec_test = precision_score(y7_test, y7_pred_test)
rec_test = recall_score(y7_test, y7_pred_test)
print(" -- test set -- ")
for label, value in (("Accuracy :", acc_test), ("Precision:", prec_test),
                     ("Recall. :", rec_test)):
    print("{} {:.4f}".format(label, value))
-- train set -- Accuracy : 1.0000 Precision: 1.0000 Recall. : 1.0000 -- test set -- Accuracy : 0.9902 Precision: 0.0000 Recall. : 0.0000
Unexpected result: test-set precision and recall are both 0.0000 despite 99% accuracy. Cluster 0's mean "booked" rate is ~0.0015 (see the cluster summary above), so the positive class is almost absent in this cluster and the tree never predicts a booking correctly on the test split.